import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# from nltk.tokenize import RegexpTokenizer
from nltk.metrics import edit_distance
# from nltk.stem import WordNetLemmatizer
# from nltk.stem.porter import PorterStemmer
# from nltk.util import ngrams
# from nltk import pos_tag
import string
import spacy
import time
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw survey export; ISO-8859-1 avoids UnicodeDecodeError on non-UTF-8 bytes.
df_raw = pd.read_csv("talent.csv",encoding = "ISO-8859-1")
# Convert column names
cols = ['index_col','education','talents','goal','team','work_remotely','start_date','platforms','pick_startups','pick_teams','add_value','achievements','ideal_job','extra']
df_raw.columns = cols
# Setting index
# The raw index column apparently carries thousands separators (e.g. "3,256"),
# so strip commas before casting to int64.
df_raw.index_col = df_raw.index_col.str.replace(',','').astype('int64')
df_raw.set_index('index_col', inplace=True)
df_raw.head(1)
print(df_raw.info())
print('-'*40)
print(df_raw.isnull().sum())
# Drop mostly null & useless columns
df_raw = df_raw.drop(['goal','team','work_remotely','start_date','platforms','pick_startups','pick_teams','ideal_job'], axis=1)
df_raw = df_raw.drop_duplicates(keep='first')
# NOTE(review): display() is an IPython/Jupyter builtin — this file only runs
# as-is inside a notebook environment.
display(df_raw.head(1))
# Separate the education column for easier analysis
# .loc is label-based and INCLUSIVE: keeps index labels 0..3256.
df_new = df_raw.loc[:3256]
# Blank out NaNs so the string/tokenize steps below never see floats.
df_new = df_new.fillna(value='')
print(df_new.shape)
df_new.head()
# Punctuation to strip: all ASCII punctuation plus a few full-width CJK marks.
str_punc_list = list(string.punctuation) + ['。',',','(',')',':',';']

def remove_punctuation_list(text):
    """Lowercase *text* and replace every punctuation character with a space."""
    lowered = text.lower()
    cleaned = []
    for ch in lowered:
        cleaned.append(' ' if ch in str_punc_list else ch)
    return ''.join(cleaned)
# Replace punctuation with spaces in the education text (also lowercases it).
df_new['education'] = df_new.education.apply(remove_punctuation_list)
df_new.head()
# Split each cleaned education string into a list of word tokens.
df_new['education'] = df_new.education.apply(word_tokenize)
df_new.head()
# Token vocabularies used to detect education levels. Degree codes:
# 1 - bachelor
# 2 - master
# 3 - mba
# 4 - phd
bachelor_list = ['b','ba','bs', 'bba', 'basc', 'bas', 'bse', 'bsc',
                 'bsba', 'bsb', 'ibs', 'be', 'bfa', 'btech', 'bcom']
university_list = ['undergraduate','virginia', 'college','pomona',
                   'rutger', 'rutgers','uc', 'ucla', 'ucsd',
                   'uconn', 'uchicago', 'ucsb', 'ucl', 'uci','uiuc','byu']
ba_uni_syn_list = ['bachelor', 'university', 'institute', 'polytechnic']
master_list = ['m','ma','ms', 'msc', 'mscs', 'mse', 'msi', 'msf', 'msit',
               'msim', 'mba', 'msba', 'msis', 'msu', 'msmis', 'msm',
               'master', 'masters','micromaster', 'micromasters']
mba_list = ['mba']
phd_list = ['phd','ph','doctor']
ba_uni_list = bachelor_list + university_list

# Checked in insertion order, so the highest matching degree wins
# (e.g. 'mba' maps to 3 even though it also appears in master_list).
degree_dict = { 4 : phd_list,
                3 : mba_list,
                2 : master_list,
                1 : ba_uni_list}

def find_degree(word_list):
    """Return the degree code for a tokenized education string (0 = none found)."""
    # Pass 1: exact token matches, highest degree first.
    for code, keywords in degree_dict.items():
        if any(keyword in word_list for keyword in keywords):
            return code
    # Pass 2: fuzzy fallback — any token within edit distance 3 of a
    # bachelor/university synonym still counts as bachelor-level.
    for synonym in ba_uni_syn_list:
        for token in word_list:
            if edit_distance(synonym, token) < 4:
                return 1
    return 0
# Tag every row with a degree code: 4=PhD, 3=MBA, 2=Master, 1=Bachelor, 0=unknown.
df_new['degree'] = df_new.education.apply(find_degree)
# Row counts per degree code ('values' just picks a column to count).
df_new.pivot_table(index='degree', values='education', aggfunc='count')
#load the spacy module
nlp = spacy.load('en_core_web_sm')
#list of stopwords in english
# '-PRON-' is presumably the spaCy 2.x pronoun-lemma placeholder — filtered
# out alongside regular English stopwords. TODO confirm spaCy version.
stopwords_list = stopwords.words('english') + ['-PRON-']
def remove_stopwords(text):
    """Drop every token of *text* that appears in the module-level stopwords_list."""
    kept = []
    for token in text:
        if token not in stopwords_list:
            kept.append(token)
    return kept
#use spacy to lemmatize each word
def lemma(text):
    """Normalize free text: lowercase, strip punctuation, lemmatize with spaCy,
    tokenize, and drop stopwords. Returns a list of tokens."""
    cleaned = remove_punctuation_list(text.lower())
    lemmatized = " ".join(token.lemma_ for token in nlp(cleaned))
    tokens = word_tokenize(lemmatized)
    return remove_stopwords(tokens)
df_new.head()
# Lemmatize and de-stopword each free-text column in place
# (slow: one spaCy parse per cell).
for col in ['talents', 'add_value', 'achievements', 'extra']:
    df_new[col] = df_new[col].apply(lemma)
df_new
# Single-token keyword vocabularies per role (matched against the token list).
data_skill = ['sql','python','r','tableau','sas','spark','scala','database',
              'ml','scikit','regression','forest','classify','statistics','statistical','visualization',
              'analysis','analytic','mine','predictive','prescriptive','nlp']
software_skill = ['python','java','c++','linux','c','oracle','software','algorithm']
web_skill = ['html','css','javascript','php']
design_skill = ['ui','ux','uiux','design','adobe','photoshop','illustrator','ps']
# 1 - Data Scientist
# 2 - Software Engineer
# 3 - Web Developer
# 4 - Designer
skill_dict = {1: data_skill,
              2: software_skill,
              3: web_skill,
              4: design_skill}
# Multi-word phrases, matched as substrings of the space-rejoined token list.
long_skill_dict = {1: ['machine learning','deep learning','data mining','data modeling','data analytic','business analytic',
                       'data scientist','business intelligence','data cleaning','natural language processing'],
                   2: ['develop app','computer science','computer engineering','web services','data structure',
                       'software engineer','software development','software engineering','software developer'],
                   3: ['back-end','front-end','web app','mobile app','webpage design','website design','information architecture',
                       'web programming','web developer','web application','web design','web applications','information technology'],
                   4:['user interface','user experience','logo design','graphic design']}

def find_skill_score(text):
    """Score a token list against the four skill areas.

    text: list of (lemmatized) string tokens.
    Returns a length-4 numpy array; entry k-1 counts keyword hits from
    skill_dict[k] plus phrase hits from long_skill_dict[k].
    """
    skill_score = np.zeros(4)
    for key, keywords in skill_dict.items():
        for keyword in keywords:
            if keyword in text:
                skill_score[key-1] += 1
    # FIX: the join is loop-invariant — the original rebuilt " ".join(text)
    # once per phrase inside the nested loops; hoist it out.
    joined = " ".join(text)
    for key, phrases in long_skill_dict.items():
        for phrase in phrases:
            if phrase in joined:
                skill_score[key-1] += 1
    return skill_score
def find_max(arr):
    """Return the 1-based index of the highest score in *arr*,
    or 0 when every score is zero (no skill detected)."""
    no_hits = (arr == np.zeros(4)).all()
    return 0 if no_hits else arr.argmax() + 1
# Score each free-text column separately, then sum the per-role hit vectors.
df_talent_score = df_new['talents'].apply(find_skill_score)
df_value_score = df_new['add_value'].apply(find_skill_score)
df_extra_score = df_new['extra'].apply(find_skill_score)
df_new['skill_score'] = df_talent_score + df_value_score + df_extra_score
# Winning role per row (0 = no skill keywords found anywhere).
df_new['job_id'] = df_new.skill_score.apply(find_max)
df_new.pivot_table(index='job_id',values='education',aggfunc='count')
# NOTE(review): deg_dict has no entry for 0, so rows with degree == 0 map to
# NaN in deg_name, whereas job_dict maps 0 to 'Not specified' — confirm this
# asymmetry is intended (NaN rows drop out of the groupby/Sankey below).
deg_dict = {1 : 'Bachelor', 2 : 'Master', 3 : 'MBA', 4: 'Ph.D.'}
job_dict = {1 : 'Data Scientist', 2 : 'Software Engineer', 3 : 'Web Developer', 4: 'Designer', 0:'Not specified'}
df_new['deg_name'] = df_new.degree.map(deg_dict)
df_new['job_name'] = df_new.job_id.map(job_dict)
df_new.head()
# Persist, then reload. Presumably a checkpoint; note the round-trip turns the
# list-valued columns into their string repr.
df_new.to_csv('talent_deg_job.csv')
df_new = pd.read_csv('talent_deg_job.csv', index_col=0)
import plotly.graph_objects as go
# Headline counts per degree code. value_counts() is keyed by code, so this
# indexing raises KeyError if a code never occurs in the data.
deg_count = df_new.degree.value_counts()
ba_count = deg_count[1]
ma_count = deg_count[2]
mba_count = deg_count[3]
phd_count = deg_count[4]
deg_tot = deg_count.sum()
# Same headline counts per inferred job role.
job_count = df_new.job_id.value_counts()
ds_count = job_count[1]
sde_count = job_count[2]
wd_count = job_count[3]
d_count = job_count[4]
job_tot = job_count.sum()
import plotly.offline as pyo
# Required for plotly figures to render inline in a classic notebook.
pyo.init_notebook_mode()
# Dashboard of headline numbers: degree counts (top row, blue) and inferred
# job-role counts (two lower rows, orange). Each Indicator occupies a slice
# of the figure via its 'domain' (x/y fractions of the canvas).
# NOTE(review): mba_count, deg_tot and job_tot are computed above but never
# displayed — confirm whether an MBA tile / totals tile was intended.
fig = go.Figure()
fig.add_trace(go.Indicator(
    mode = "number",
    value = ba_count,
    number = {'font': {'color': 'blue'}},
    title = {"text": "Bachelor's Degree"},
    domain = {'x': [0, 1/3], 'y': [0.66, 0.86]}))
fig.add_trace(go.Indicator(
    mode = "number",
    value = ma_count,
    number = {'font': {'color': 'blue'}},
    title = {"text": "Master Degree"},
    domain = {'x': [1/3, 2/3], 'y': [0.66, 0.86]}))
fig.add_trace(go.Indicator(
    mode = "number",
    value = phd_count,
    number = {'font': {'color': 'blue'}},
    title = {"text": "Ph.D."},
    domain = {'x': [2/3, 1], 'y': [0.66, 0.86]}))
fig.add_trace(go.Indicator(
    mode = "number",
    value = ds_count,
    number = {'font': {'color': 'orange'}},
    title = {"text": "Data Scientist"},
    domain = {'x': [0, 1/2], 'y': [0.33, 0.53]}))
fig.add_trace(go.Indicator(
    mode = "number",
    value = sde_count,
    number = {'font': {'color': 'orange'}},
    title = {"text": "Software Engineer"},
    domain = {'x': [1/2, 1], 'y': [0.33, 0.53]}))
fig.add_trace(go.Indicator(
    mode = "number",
    value = wd_count,
    number = {'font': {'color': 'orange'}},
    title = {"text": "Web Developer"},
    domain = {'x': [0, 1/2], 'y': [0, 0.2]}))
# BUG FIX: this tile shows d_count (the Designer count) but was titled
# "Web Developer", duplicating the previous tile's title.
fig.add_trace(go.Indicator(
    mode = "number",
    value = d_count,
    number = {'font': {'color': 'orange'}},
    title = {"text": "Designer"},
    domain = {'x': [1/2, 1], 'y': [0, 0.2]}))
# fig.update_layout(paper_bgcolor = "lightgray")
fig.show()
# Side-by-side bar charts of the degree and skill distributions,
# each bar annotated with its count.
sns.color_palette(palette = 'colorblind')
fig = plt.figure(figsize=(20,6))
ax1 = fig.add_subplot(1,2,1)
fig1 = sns.countplot(x='deg_name', data=df_new, ax=ax1)
fig1.set(xlabel='Degree Name', ylabel='Number of People with Each Degree', title='Degree Distribution')
# Write each bar's height just above the bar (vertical offset of 20 data units).
for f in fig1.patches:
    h = f.get_height()
    fig1.text(f.get_x() + f.get_width()/2., h+20, h ,ha="center")
ax2 = fig.add_subplot(1,2,2)
fig2 = sns.countplot(x='job_name', data=df_new, ax=ax2)
fig2.set(xlabel='Skill Name', ylabel='Number of People with Each Skill', title='Skill Distribution')
for f in fig2.patches:
    h = f.get_height()
    fig2.text(f.get_x() + f.get_width()/2., h+20, h ,ha="center")
# (Stray notebook-cell ';' removed — a bare semicolon on its own line is a
# syntax error in a plain .py file.)
# Markdown note carried over from the original notebook:
#   Are the following data misplaced?
#   3 hackathon winners
# NOTE(review): df_new was built from df_raw.loc[:3256] (and reloaded from
# CSV), so selecting labels >= 3257 should return an empty frame — this was
# presumably meant to run against df_raw. TODO confirm.
df_new.loc[3257:,:]
# Sankey view of degree -> job flows.
# NOTE(review): pandas/numpy/plotly are re-imported here — leftover from
# notebook cell ordering; redundant but harmless.
import pandas as pd
import numpy as np
import holoviews as hv
import plotly.graph_objects as go
import plotly.express as pex
hv.extension('bokeh')
# One row per (degree, job) pair with its occurrence count in 'size';
# rows whose deg_name is NaN (degree code 0) drop out of the groupby.
df_grouped = df_new.groupby(by=["deg_name","job_name"]).size().to_frame('size')
df_grouped = df_grouped.reset_index()
print(df_grouped)
# Columns are consumed positionally as (source, target, value) — presumably
# deg_name -> job_name weighted by size; verify against hv.Sankey docs.
plot1 = hv.Sankey(df_grouped)
plot1.opts(cmap='Colorblind',label_position='left',
edge_color='job_name', edge_line_width=0,
node_alpha=1.0, node_width=40, node_sort=True,
width=800, height=600, bgcolor="snow",
title="Forkaia Talent Snapshot")